package au.com.acpfg.misc.jemboss.io; import java.io.File; import java.io.FileInputStream; import java.io.FileReader; import java.io.IOException; import java.io.InputStream; import java.util.ArrayList; import java.util.HashMap; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.knime.base.node.util.BufferedFileReader; import org.knime.core.data.DataCell; import org.knime.core.data.DataColumnSpec; import org.knime.core.data.DataColumnSpecCreator; import org.knime.core.data.DataTableSpec; import org.knime.core.data.DataType; import org.knime.core.data.def.DefaultRow; import org.knime.core.data.def.StringCell; import org.knime.core.node.BufferedDataContainer; import org.knime.core.node.CanceledExecutionException; import org.knime.core.node.InvalidSettingsException; import au.com.acpfg.misc.jemboss.local.AbstractTableMapper; import au.com.acpfg.misc.jemboss.settings.ProgramSetting; /** * Responsible for decoding an emboss program which produces a FASTA style file (eg. seqoutall program setting) * * @author andrew.cassin * */ public class FastaUnmarshaller implements UnmarshallerInterface { private static final Pattern accsn_pattern = Pattern.compile("^(\\S+)\\b"); private static final Pattern descr_pattern = Pattern.compile("^\\S+\\s*(.*)$"); private static int id = 0; public void addColumns(AbstractTableMapper atm, ProgramSetting for_this ) { ArrayList<DataColumnSpec> vec = new ArrayList<DataColumnSpec>(); String basename = for_this.getName()+":"; vec.add(new DataColumnSpecCreator(basename+"Accession", StringCell.TYPE).createSpec()); vec.add(new DataColumnSpecCreator(basename+"Description", StringCell.TYPE).createSpec()); vec.add(new DataColumnSpecCreator(basename+"Sequence", StringCell.TYPE).createSpec()); atm.addFormattedColumns(for_this, vec); atm.addRawColumn(for_this, new DataColumnSpecCreator(basename+"Raw output", StringCell.TYPE).createSpec()); } @Override public void process(ProgramSetting for_this, InputStream emboss_prog_output_stream, AbstractTableMapper atm) throws IOException,InvalidSettingsException { String basename = for_this.getName()+":"; DataTableSpec spec_formatted = atm.getFormattedTableSpec(); String name_row = "RowID"; String name_id = basename+"Accession"; String name_descr= basename+"Description"; String name_seq = basename+"Sequence"; String rid = atm.getCurrentRow(); BufferedFileReader rseq = BufferedFileReader.createNewReader(emboss_prog_output_stream); String line = null; boolean done = false; boolean already_got_header = false; StringBuffer seq = null; String[] accsn = null; String[] descr = null; StringBuffer raw = new StringBuffer(10*1024); while (!done) { // get header line if (!already_got_header) { do { line = rseq.readLine(); if (line == null) { done = true; break; } } while (!line.startsWith(">")); } if (!done) { raw.append(line); raw.append('\n'); String[] entries = line.split("\\x01"); if (entries.length > 0 && entries[0].startsWith(">")) { entries[0] = entries[0].substring(1); // skip over > for parse_accession() } accsn = parse_accession(accsn_pattern,entries); descr = parse_description(descr_pattern,entries); String tline; seq = new StringBuffer(10 * 1024); boolean got_seq = false; already_got_header = false; int tline_len = 0; do { if ((line = rseq.readLine()) == null) { already_got_header = false; break; } tline = line.trim(); tline_len = tline.length(); if (tline_len > 0) { char first_c = tline.charAt(0); if (first_c == '>') { got_seq = false; already_got_header = true; break; } if (Character.isLetter(first_c) || first_c == '*' || first_c == '-') { seq.append(tline); raw.append(tline); raw.append('\n'); got_seq = true; } } } while (tline_len == 0 || got_seq ); } // save the sequence to the container HashMap<String,DataCell> cellmap = new HashMap<String,DataCell>(); if (!done && seq != null && accsn != null && descr != null) { cellmap.put(name_row, new StringCell(rid)); cellmap.put(name_id, new StringCell(accsn[0])); cellmap.put(name_descr, new StringCell(descr[0])); cellmap.put(name_seq, new StringCell(seq.toString())); atm.setFormattedCells(cellmap); atm.emitFormattedRow(); } } rseq.close(); atm.setRawOutputCell(for_this, new StringCell(raw.toString())); // NB: emitting of a raw row is done once all unmarshalling is done (ie. all settings processed) } protected String[] parse_accession(Pattern matcher, String[] entries) throws InvalidSettingsException { int cnt = 0; String[] accsns = new String[entries.length]; for (String entry : entries) { Matcher m = matcher.matcher(entry); if (m.find()) { if (m.groupCount() != 1) { throw new InvalidSettingsException("You must use capturing parentheses () to match an accession only once!"); } accsns[cnt] = m.group(1); cnt++; } } if (cnt < entries.length) { accsns[cnt] = null; // make sure array has null after last match } return (cnt > 0) ? accsns : null; } protected String[] parse_description(Pattern matcher, String[] entries) throws InvalidSettingsException { int cnt = 0; String[] descrs = new String[entries.length]; for (String entry : entries) { Matcher m = matcher.matcher(entry); if (m.find()) { if (m.groupCount() != 1) { throw new InvalidSettingsException("You must use capturing parentheses() to match a sequence description only once!"); } descrs[cnt] = m.group(1); cnt++; } } if (cnt < entries.length) { descrs[cnt] = null; } return (cnt > 0) ? descrs : null; } }